NOTE: Before starting this assignment, please remember to clear your environment. You can do that by running the following code chunk.
# Remove every object (including hidden ones, via all = TRUE) from the global
# environment so models from earlier runs cannot interfere with this one.
# NOTE(review): rm(list = ls()) is generally discouraged inside scripts; it is
# kept here only because the assignment explicitly asks for a clean environment.
rm(list = ls(all=TRUE))
NOTE: Be careful when moving back and forth between the various sections of this assignment. We will be building a lot of models, and unexpected things might happen if you don’t carefully manage the objects in your global environment.
Read in the data
Data Pre-processing
Build a linear SVM model
Do cross validation for finding the optimal C value
Build SVM with Kernels
Report Metrics of the various Models on Test Data
# change your working directory using the "setwd()" function, if your dataset is located elsewhere
# Read the breast-cancer diagnosis CSV into a data frame.
cancer_data <- read.csv("cancer_diagnosis.csv")
# Inspect the structure: column names, types, and the first few values.
str(cancer_data)
## 'data.frame': 569 obs. of 32 variables:
## $ id : int 842302 842517 84300903 84348301 84358402 843786 844359 84458202 844981 84501001 ...
## $ Cancer: int 1 1 1 1 1 1 1 1 1 1 ...
## $ V3 : num 18 20.6 19.7 11.4 20.3 ...
## $ V4 : num 10.4 17.8 21.2 20.4 14.3 ...
## $ V5 : num 122.8 132.9 130 77.6 135.1 ...
## $ V6 : num 1001 1326 1203 386 1297 ...
## $ V7 : num 0.1184 0.0847 0.1096 0.1425 0.1003 ...
## $ V8 : num 0.2776 0.0786 0.1599 0.2839 0.1328 ...
## $ V9 : num 0.3001 0.0869 0.1974 0.2414 0.198 ...
## $ V10 : num 0.1471 0.0702 0.1279 0.1052 0.1043 ...
## $ V11 : num 0.242 0.181 0.207 0.26 0.181 ...
## $ V12 : num 0.0787 0.0567 0.06 0.0974 0.0588 ...
## $ V13 : num 1.095 0.543 0.746 0.496 0.757 ...
## $ V14 : num 0.905 0.734 0.787 1.156 0.781 ...
## $ V15 : num 8.59 3.4 4.58 3.44 5.44 ...
## $ V16 : num 153.4 74.1 94 27.2 94.4 ...
## $ V17 : num 0.0064 0.00522 0.00615 0.00911 0.01149 ...
## $ V18 : num 0.049 0.0131 0.0401 0.0746 0.0246 ...
## $ V19 : num 0.0537 0.0186 0.0383 0.0566 0.0569 ...
## $ V20 : num 0.0159 0.0134 0.0206 0.0187 0.0188 ...
## $ V21 : num 0.03 0.0139 0.0225 0.0596 0.0176 ...
## $ V22 : num 0.00619 0.00353 0.00457 0.00921 0.00511 ...
## $ V23 : num 25.4 25 23.6 14.9 22.5 ...
## $ V24 : num 17.3 23.4 25.5 26.5 16.7 ...
## $ V25 : num 184.6 158.8 152.5 98.9 152.2 ...
## $ V26 : num 2019 1956 1709 568 1575 ...
## $ V27 : num 0.162 0.124 0.144 0.21 0.137 ...
## $ V28 : num 0.666 0.187 0.424 0.866 0.205 ...
## $ V29 : num 0.712 0.242 0.45 0.687 0.4 ...
## $ V30 : num 0.265 0.186 0.243 0.258 0.163 ...
## $ V31 : num 0.46 0.275 0.361 0.664 0.236 ...
## $ V32 : num 0.1189 0.089 0.0876 0.173 0.0768 ...
The dataset has 569 observations of 32 variables; the descriptions of the variables are given below:
id : Unique identification number of the sample
Cancer : This column represents whether the patient has a benign/normal tumor (0) or a cancerous one (1)
The remaining 30 variables are real valued measurements some of which are given below:
# Preview the first six rows to sanity-check the parsed values.
head(cancer_data)
## id Cancer V3 V4 V5 V6 V7 V8 V9 V10
## 1 842302 1 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.3001 0.14710
## 2 842517 1 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.0869 0.07017
## 3 84300903 1 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.1974 0.12790
## 4 84348301 1 11.42 20.38 77.58 386.1 0.14250 0.28390 0.2414 0.10520
## 5 84358402 1 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.1980 0.10430
## 6 843786 1 12.45 15.70 82.57 477.1 0.12780 0.17000 0.1578 0.08089
## V11 V12 V13 V14 V15 V16 V17 V18 V19
## 1 0.2419 0.07871 1.0950 0.9053 8.589 153.40 0.006399 0.04904 0.05373
## 2 0.1812 0.05667 0.5435 0.7339 3.398 74.08 0.005225 0.01308 0.01860
## 3 0.2069 0.05999 0.7456 0.7869 4.585 94.03 0.006150 0.04006 0.03832
## 4 0.2597 0.09744 0.4956 1.1560 3.445 27.23 0.009110 0.07458 0.05661
## 5 0.1809 0.05883 0.7572 0.7813 5.438 94.44 0.011490 0.02461 0.05688
## 6 0.2087 0.07613 0.3345 0.8902 2.217 27.19 0.007510 0.03345 0.03672
## V20 V21 V22 V23 V24 V25 V26 V27 V28 V29
## 1 0.01587 0.03003 0.006193 25.38 17.33 184.60 2019.0 0.1622 0.6656 0.7119
## 2 0.01340 0.01389 0.003532 24.99 23.41 158.80 1956.0 0.1238 0.1866 0.2416
## 3 0.02058 0.02250 0.004571 23.57 25.53 152.50 1709.0 0.1444 0.4245 0.4504
## 4 0.01867 0.05963 0.009208 14.91 26.50 98.87 567.7 0.2098 0.8663 0.6869
## 5 0.01885 0.01756 0.005115 22.54 16.67 152.20 1575.0 0.1374 0.2050 0.4000
## 6 0.01137 0.02165 0.005082 15.47 23.75 103.40 741.6 0.1791 0.5249 0.5355
## V30 V31 V32
## 1 0.2654 0.4601 0.11890
## 2 0.1860 0.2750 0.08902
## 3 0.2430 0.3613 0.08758
## 4 0.2575 0.6638 0.17300
## 5 0.1625 0.2364 0.07678
## 6 0.1741 0.3985 0.12440
# Preview the last six rows as well, to confirm the file was read completely.
tail(cancer_data)
## id Cancer V3 V4 V5 V6 V7 V8 V9
## 564 926125 1 20.92 25.09 143.00 1347.0 0.10990 0.22360 0.31740
## 565 926424 1 21.56 22.39 142.00 1479.0 0.11100 0.11590 0.24390
## 566 926682 1 20.13 28.25 131.20 1261.0 0.09780 0.10340 0.14400
## 567 926954 1 16.60 28.08 108.30 858.1 0.08455 0.10230 0.09251
## 568 927241 1 20.60 29.33 140.10 1265.0 0.11780 0.27700 0.35140
## 569 92751 0 7.76 24.54 47.92 181.0 0.05263 0.04362 0.00000
## V10 V11 V12 V13 V14 V15 V16 V17 V18
## 564 0.14740 0.2149 0.06879 0.9622 1.026 8.758 118.80 0.006399 0.04310
## 565 0.13890 0.1726 0.05623 1.1760 1.256 7.673 158.70 0.010300 0.02891
## 566 0.09791 0.1752 0.05533 0.7655 2.463 5.203 99.04 0.005769 0.02423
## 567 0.05302 0.1590 0.05648 0.4564 1.075 3.425 48.55 0.005903 0.03731
## 568 0.15200 0.2397 0.07016 0.7260 1.595 5.772 86.22 0.006522 0.06158
## 569 0.00000 0.1587 0.05884 0.3857 1.428 2.548 19.15 0.007189 0.00466
## V19 V20 V21 V22 V23 V24 V25 V26 V27
## 564 0.07845 0.02624 0.02057 0.006213 24.290 29.41 179.10 1819.0 0.14070
## 565 0.05198 0.02454 0.01114 0.004239 25.450 26.40 166.10 2027.0 0.14100
## 566 0.03950 0.01678 0.01898 0.002498 23.690 38.25 155.00 1731.0 0.11660
## 567 0.04730 0.01557 0.01318 0.003892 18.980 34.12 126.70 1124.0 0.11390
## 568 0.07117 0.01664 0.02324 0.006185 25.740 39.42 184.60 1821.0 0.16500
## 569 0.00000 0.00000 0.02676 0.002783 9.456 30.37 59.16 268.6 0.08996
## V28 V29 V30 V31 V32
## 564 0.41860 0.6599 0.2542 0.2929 0.09873
## 565 0.21130 0.4107 0.2216 0.2060 0.07115
## 566 0.19220 0.3215 0.1628 0.2572 0.06637
## 567 0.30940 0.3403 0.1418 0.2218 0.07820
## 568 0.86810 0.9387 0.2650 0.4087 0.12400
## 569 0.06444 0.0000 0.0000 0.2871 0.07039
# The target must be a factor so svm()/train() perform classification
# rather than regression.
cancer_data$Cancer <- as.factor(cancer_data$Cancer)
# Drop the id column: it is a sample identifier with no predictive value.
cancer_data <- cancer_data[ , !(colnames(cancer_data) %in% "id")]
# Verify there are no missing values anywhere in the data.
sum(is.na(cancer_data))
## [1] 0
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
# Fix the RNG seed so the train/test split is reproducible.
set.seed(1234)
# Stratified 70/30 split: createDataPartition() preserves the class
# proportions of Cancer in both partitions.
index_train <- createDataPartition(cancer_data$Cancer, p = 0.7, list = F)
pre_train <- cancer_data[index_train, ]
pre_test <- cancer_data[-index_train, ]
Standardize all the real-valued variables in the dataset, as this provides numerical stability to the SVM solution.
Let’s use the preProcess() function from the caret package to standardize the variables, using just the data points in the training data
# Learn centering/scaling parameters from the TRAINING partition only, then
# apply the same transform to both partitions -- this avoids test-set leakage.
std_method <- preProcess(pre_train, method = c("center", "scale"))
train_data <- predict(std_method, pre_train)
test_data <- predict(std_method, pre_test)
library(e1071)
# Baseline linear SVM with e1071 defaults (cost C = 1, per the summary below).
model_svm <- svm(Cancer ~ . , train_data, kernel = "linear")
# Report kernel, cost, and the number of support vectors found.
summary(model_svm)
##
## Call:
## svm(formula = Cancer ~ ., data = train_data, kernel = "linear")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
## gamma: 0.03333333
##
## Number of Support Vectors: 34
##
## ( 18 16 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
Now, let’s create a sampling strategy using the trainControl() function and use the train() function from the caret package to get the best value of C
One way to tune models is to first search an exponential space and then do a more refined search near the optimal region.
library(caret)
# 4-fold cross-validation repeated 10 times; this resampling scheme is reused
# for the kernel models later in the document.
sampling_strategy <- trainControl(method = "repeatedcv", number = 4, repeats = 10)
# Coarse exponential search over C from 1e-4 to 1e3.
# NOTE(review): 10^0 (C = 1) is missing from this grid -- presumably an
# oversight, though the fine search below covers that region anyway; confirm.
svm_rough_model_c <- train(Cancer ~ . , train_data, method = "svmLinear",
tuneGrid = data.frame(.C = c(10^-4, 10^-3, 10^-2, 10^-1, 10^1, 10^2, 10^3)), trControl = sampling_strategy)
# Print cross-validated accuracy/kappa for every C and the chosen optimum.
svm_rough_model_c
## Support Vector Machines with Linear Kernel
##
## 399 samples
## 30 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (4 fold, repeated 10 times)
## Summary of sample sizes: 299, 299, 299, 300, 299, 300, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 1e-04 0.6265688 0.0000000
## 1e-03 0.9351042 0.8561251
## 1e-02 0.9616655 0.9160153
## 1e-01 0.9699333 0.9348073
## 1e+01 0.9579204 0.9099957
## 1e+02 0.9426147 0.8779566
## 1e+03 0.9421122 0.8769344
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was C = 0.1.
# Refined search on a log-spaced grid (10^-1.75 .. 10^-0.25) around the coarse
# optimum C = 0.1 found above, using the same repeated-CV strategy.
svm_fine_model_c <- train(Cancer ~ . , train_data, method = "svmLinear",
tuneGrid = data.frame(.C = c(10^-0.25, 10^-0.5, 10^-0.75, 10^-1, 10^-1.25, 10^-1.5, 10^-1.75)), trControl = sampling_strategy, metric = "Accuracy")
# Print the refined results and the final C selected by accuracy.
svm_fine_model_c
## Support Vector Machines with Linear Kernel
##
## 399 samples
## 30 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (4 fold, repeated 10 times)
## Summary of sample sizes: 300, 298, 300, 299, 299, 299, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 0.01778279 0.9621560 0.9173392
## 0.03162278 0.9656662 0.9251594
## 0.05623413 0.9671688 0.9285161
## 0.10000000 0.9706739 0.9362963
## 0.17782794 0.9684263 0.9315755
## 0.31622777 0.9699188 0.9349187
## 0.56234133 0.9701687 0.9356765
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was C = 0.1.
Hence, from the above cross validation experiment, we can choose the C parameter that gives us the best cross validation accuracy
You might see a slightly different result due to the randomness that arises from the sampling process in cross validation
Let’s measure the performance of our optimized svm on the test data
# Score the held-out test data with both the baseline (C = 1) model and the
# cross-validated model so their confusion matrices can be compared.
preds_svm <- predict(model_svm, test_data)
preds_svm_optimized <- predict(svm_fine_model_c, test_data)
# Test-set performance of the baseline linear SVM (positive class is "0").
confusionMatrix(preds_svm, test_data$Cancer)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 106 4
## 1 1 59
##
## Accuracy : 0.9706
## 95% CI : (0.9327, 0.9904)
## No Information Rate : 0.6294
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9363
## Mcnemar's Test P-Value : 0.3711
##
## Sensitivity : 0.9907
## Specificity : 0.9365
## Pos Pred Value : 0.9636
## Neg Pred Value : 0.9833
## Prevalence : 0.6294
## Detection Rate : 0.6235
## Detection Prevalence : 0.6471
## Balanced Accuracy : 0.9636
##
## 'Positive' Class : 0
##
# Test-set performance of the tuned linear SVM, for comparison with the baseline.
confusionMatrix(preds_svm_optimized, test_data$Cancer)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 106 3
## 1 1 60
##
## Accuracy : 0.9765
## 95% CI : (0.9409, 0.9936)
## No Information Rate : 0.6294
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9492
## Mcnemar's Test P-Value : 0.6171
##
## Sensitivity : 0.9907
## Specificity : 0.9524
## Pos Pred Value : 0.9725
## Neg Pred Value : 0.9836
## Prevalence : 0.6294
## Detection Rate : 0.6235
## Detection Prevalence : 0.6412
## Balanced Accuracy : 0.9715
##
## 'Positive' Class : 0
##
But the problem is that transforming the data to higher dimensions is computationally expensive.
So, that is where the kernel trick comes in.
For that we have to formulate our machine learning problem in terms of the dot product
We can explore various kernel functions to compute the dot product in higher dimensions to find the maximum margin linear classifying boundary in the higher dimension, without transforming our data into a higher dimension space.
We can access various non linear kernels from the kernlab package
library(kernlab)
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
# Polynomial-kernel SVM via kernlab; with the defaults printed below
# (degree = 1, scale = 1, offset = 1) this is effectively a linear kernel.
svm_poly <- ksvm(Cancer ~ . , train_data, kernel = "polydot")
## Setting default kernel parameters
# Print the fitted model: kernel hyperparameters, support vectors, training error.
svm_poly
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 1
##
## Polynomial kernel function.
## Hyperparameters : degree = 1 scale = 1 offset = 1
##
## Number of Support Vectors : 34
##
## Objective Function Value : -19.1402
## Training error : 0.012531
# Joint grid search over cost, polynomial degree, and kernel scale
# (6 x 3 x 3 = 54 candidates), using the same repeated-CV strategy as before.
svm_rough_model_poly <- train(Cancer ~ . , train_data, method = "svmPoly",
tuneGrid = expand.grid(.C = c(10^-3, 10^-2.5, 10^-2.1, 10^-1.5, 10^-1.2, 10^-0.6), .degree = c(2, 3, 5), .scale = c(0.15, 0.25, 1)), trControl = sampling_strategy)
# Print cross-validated results for every (C, degree, scale) combination.
svm_rough_model_poly
## Support Vector Machines with Polynomial Kernel
##
## 399 samples
## 30 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (4 fold, repeated 10 times)
## Summary of sample sizes: 299, 299, 300, 299, 300, 299, ...
## Resampling results across tuning parameters:
##
## C degree scale Accuracy Kappa
## 0.001000000 2 0.15 0.8150245 0.5581616
## 0.001000000 2 0.25 0.8601466 0.6753691
## 0.001000000 2 1.00 0.9265471 0.8354901
## 0.001000000 3 0.15 0.8729297 0.7070163
## 0.001000000 3 0.25 0.8967381 0.7649931
## 0.001000000 3 1.00 0.9576232 0.9067306
## 0.001000000 5 0.15 0.9057536 0.7869676
## 0.001000000 5 0.25 0.9350749 0.8560248
## 0.001000000 5 1.00 0.9275719 0.8427975
## 0.003162278 2 0.15 0.9015058 0.7767312
## 0.003162278 2 0.25 0.9182891 0.8164829
## 0.003162278 2 1.00 0.9531156 0.8971167
## 0.003162278 3 0.15 0.9112712 0.7998761
## 0.003162278 3 0.25 0.9428478 0.8731265
## 0.003162278 3 1.00 0.9578907 0.9083634
## 0.003162278 5 0.15 0.9380800 0.8628431
## 0.003162278 5 0.25 0.9463479 0.8817992
## 0.003162278 5 1.00 0.9275719 0.8427975
## 0.007943282 2 0.15 0.9323172 0.8490830
## 0.007943282 2 0.25 0.9491130 0.8874232
## 0.007943282 2 1.00 0.9578681 0.9081786
## 0.007943282 3 0.15 0.9463555 0.8811685
## 0.007943282 3 0.25 0.9538782 0.8982812
## 0.007943282 3 1.00 0.9506228 0.8934009
## 0.007943282 5 0.15 0.9473605 0.8839499
## 0.007943282 5 0.25 0.9458678 0.8822469
## 0.007943282 5 1.00 0.9275719 0.8427975
## 0.031622777 2 0.15 0.9601408 0.9125990
## 0.031622777 2 0.25 0.9623984 0.9177640
## 0.031622777 2 1.00 0.9503752 0.8935262
## 0.031622777 3 0.15 0.9628960 0.9186824
## 0.031622777 3 0.25 0.9633910 0.9200291
## 0.031622777 3 1.00 0.9501303 0.8923528
## 0.031622777 5 0.15 0.9533781 0.8989187
## 0.031622777 5 0.25 0.9423576 0.8754137
## 0.031622777 5 1.00 0.9275719 0.8427975
## 0.063095734 2 0.15 0.9631560 0.9194793
## 0.063095734 2 0.25 0.9658984 0.9259107
## 0.063095734 2 1.00 0.9458475 0.8839879
## 0.063095734 3 0.15 0.9669036 0.9277308
## 0.063095734 3 0.25 0.9686561 0.9319603
## 0.063095734 3 1.00 0.9501303 0.8923528
## 0.063095734 5 0.15 0.9519080 0.8961085
## 0.063095734 5 0.25 0.9421076 0.8748559
## 0.063095734 5 1.00 0.9275719 0.8427975
## 0.251188643 2 0.15 0.9741636 0.9441371
## 0.251188643 2 0.25 0.9661383 0.9273733
## 0.251188643 2 1.00 0.9295741 0.8496511
## 0.251188643 3 0.15 0.9701737 0.9355837
## 0.251188643 3 0.25 0.9571355 0.9080002
## 0.251188643 3 1.00 0.9501303 0.8923528
## 0.251188643 5 0.15 0.9486455 0.8892942
## 0.251188643 5 0.25 0.9421076 0.8748559
## 0.251188643 5 1.00 0.9275719 0.8427975
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were degree = 2, scale = 0.15 and C
## = 0.2511886.
# Test-set performance of the best polynomial-kernel model selected by train().
preds_svm_poly <- predict(svm_rough_model_poly, test_data)
confusionMatrix(preds_svm_poly, test_data$Cancer)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 106 3
## 1 1 60
##
## Accuracy : 0.9765
## 95% CI : (0.9409, 0.9936)
## No Information Rate : 0.6294
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9492
## Mcnemar's Test P-Value : 0.6171
##
## Sensitivity : 0.9907
## Specificity : 0.9524
## Pos Pred Value : 0.9725
## Neg Pred Value : 0.9836
## Prevalence : 0.6294
## Detection Rate : 0.6235
## Detection Prevalence : 0.6412
## Balanced Accuracy : 0.9715
##
## 'Positive' Class : 0
##
library(kernlab)
# Fit an RBF-kernel SVM with kernlab defaults.
# FIX: the original assigned this model to `svm_poly`, silently clobbering the
# polynomial model fitted earlier; it now gets its own name. Nothing downstream
# referenced `svm_poly`, so this rename is safe.
svm_rbf <- ksvm(Cancer ~ . , train_data, kernel = "rbfdot")
# Coarse search over sigma (spanning 1e-15 .. 1e8) with cost held at C = 10.
# FIX: the original call omitted trControl, so caret silently fell back to its
# default 25-rep bootstrap resampling (visible in the output below); passing
# sampling_strategy makes the resampling consistent with every other model.
svm_rough_model_rbf <- train(Cancer ~ . , train_data, method = "svmRadial",
tuneGrid = expand.grid(.C = c(10^1), .sigma = c(10^3, 10^4, 10^5, 10^8, 10^-5, 10^-10, 10^-15)), trControl = sampling_strategy)
# Print cross-validated accuracy/kappa for each sigma and the chosen optimum.
svm_rough_model_rbf
## Support Vector Machines with Radial Basis Function Kernel
##
## 399 samples
## 30 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 399, 399, 399, 399, 399, 399, ...
## Resampling results across tuning parameters:
##
## sigma Accuracy Kappa
## 1e-15 0.6306332 0.0000000
## 1e-10 0.6306332 0.0000000
## 1e-05 0.7653462 0.4230114
## 1e+03 0.6306332 0.0000000
## 1e+04 0.6306332 0.0000000
## 1e+05 0.6306332 0.0000000
## 1e+08 0.6306332 0.0000000
##
## Tuning parameter 'C' was held constant at a value of 10
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 1e-05 and C = 10.